Project description

Effect of cohesin looping on genome architecture, from the perspective of the nuclear lamina.

Introduction

In a previous documents, I prepared DamID objects for all bins and FPKM values for all genes. I will correlate these here.

Note that I tried many things in this document. It comes down to a very simple message: there is no enrichment for LAD genes upon CTCF / RAD21 perturbations.

Method

Load (z-scale) DamID tracks and FPKM values. Correlate these in various ways.

Set-up

Load the libraries and set the parameters.

# Load dependencies
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(caTools))
suppressPackageStartupMessages(library(yaml))
suppressPackageStartupMessages(library(GGally))

# Prepare output 
output.dir <- "ts220117_Correlating_Expression_and_NLinteractions"
dir.create(output.dir, showWarnings = FALSE)

# Load data
input.dir <- "ts220113_CTCF_enrichment_at_LAD_borders"
lads <- readRDS(file.path(input.dir, "LADs_pA.rds"))[[1]]
lads.border <- readRDS(file.path(input.dir, "LAD_borders_pA.rds"))[[1]]

input.dir <- "ts220113_effect_of_CTCF_depletion_on_LAD_borders"
bin.size <- readRDS(file.path(input.dir, "bin_size.rds"))
gr_damid <- readRDS(file.path(input.dir, "damid.rds"))
metadata <- readRDS(file.path(input.dir, "metadata_damid.rds"))

input.dir <- "ts220113_GeneExpression"
genes <- readRDS(file.path(input.dir, "genes.rds"))
genes.fpkm <- readRDS(file.path(input.dir, "genes_fpkm_mean.rds"))
genes.res <- readRDS(file.path(input.dir, "genes_results.rds"))

input.dir <- "ts220113_CTCF_sites_within_LADs"
genes_LAD_active <- readRDS(file.path(input.dir, 
                                      "genes_LAD_active.rds"))
genes_LAD_active_CTCFidx <- readRDS(file.path(input.dir, 
                                              "genes_LAD_active_CTCFidx.rds"))
genes_LAD_inactive <- readRDS(file.path(input.dir, 
                                        "genes_LAD_inactive.rds"))
genes_LAD_inactive_CTCFidx <- readRDS(file.path(input.dir,
                                                "genes_LAD_inactive_CTCFidx.rds"))

input.dir <- "ts220117_Positioning_DifferentialGenes"
significant_tests <- readRDS(file.path(input.dir, 
                                       "significant_tests.rds"))
genes_expression_idx <- readRDS(file.path(input.dir, 
                                          "genes_expression_idx.rds"))


# Also process the nascent expression data
input.dir <- "ts220117_NascentExpression"
#genes <- readRDS(file.path(input.dir, "genes.rds"))
genes.fpkm.nascent <- readRDS(file.path(input.dir, "genes_fpkm_mean.rds"))
genes.res.nascent <- readRDS(file.path(input.dir, "genes_results.rds"))


# Also get the LAD differences
input.dir <- "ts220113_DamID_changes_versus_LAD_size_and_score"
gr_LAD_consensus <- readRDS(file.path(input.dir, "LADs_consensus.rds"))

Prepare knitr.

library(knitr)
opts_chunk$set(fig.width = 10, fig.height = 4, 
               message = F, warning = F,
               dev=c('png', 'pdf'), fig.path = file.path(output.dir, "figures/")) 
pdf.options(useDingbats = FALSE)

Functions.

GenesDamIDScores <- function(genes, gr, extend = 1e4) {
  
  # Extend genes
  genes.ovl <- genes
  start(genes.ovl) <- start(genes.ovl) - extend
  end(genes.ovl) <- end(genes.ovl) + extend
  
  # Overlay genes with bins
  ovl <- findOverlaps(genes.ovl, gr)
  
  # Determine mean score per gene
  tib <- as_tibble(mcols(gr))[subjectHits(ovl), ] %>% 
    add_column(idx = queryHits(ovl)) %>%
    gather(key, value, -idx) %>%
    group_by(idx, key) %>%
    summarise(mean = mean(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(key = factor(key, levels = names(mcols(gr)))) %>%
    spread(key, mean)
    
  # Prepare genes with scores
  genes.damid <- genes
  mcols(genes.damid) <- tib %>% 
    dplyr::select(-idx)
  
  genes.damid
  
}

1. DamID score for genes

First, calculate DamID score for every gene.

# Overlay genes with LADs and add distance to border
genes$LAD <- overlapsAny(genes, lads, ignore.strand = T)
dis <- distanceToNearest(genes, lads.border, ignore.strand = T)
genes$LAD_distance <- mcols(dis)$distance

# Score per gene
genes.damid <- GenesDamIDScores(genes, gr_damid)

2. Correlate change in expression with change in DamID score

Compare changes in LaminB1 scores with changes in gene expression.

# Create various plots between a base sample and other samples
PlotExpressionVsDamID <- function(genes, res, expr, damid, base, samples,
                                  min_expr = -1) {
  
  # Combine into one tibble
  tib <- tibble(gene_id = genes$gene_id,
                LAD = genes$LAD,
                LAD_distance = genes$LAD_distance,
                expr_base = expr %>% pull(base),
                damid_base = as_tibble(mcols(damid)) %>% pull(base)) %>%
    mutate(LAD = LAD & (LAD_distance > 0)) %>%
    bind_cols(expr %>% 
                select(samples) %>% 
                rename_all(function(x) paste0("expr_", samples))) %>%
    bind_cols(as_tibble(mcols(damid)) %>% 
                select(samples) %>% 
                rename_all(function(x) paste0("damid_", samples))) %>%
    mutate_at(vars(paste0("expr_", samples)), 
              function(x) log2(x+1) - log2(.$expr_base+1)) %>%
    mutate_at(vars(paste0("damid_", samples)), 
              function(x) x - .$damid_base)
  
  # Get differential results
  sign <- lapply(samples, 
                 function(x) { mcols(res)[, paste(x, "vs", base, "sign", 
                                                  sep = "_")]})
  
  # Gather
  idx <- rowSums(expr > min_expr) > 0
  sign <- factor(unlist(sign), levels = c("down", "stable", "up"))[idx]
  
  tib_gather <- tib[idx, ] %>%
    dplyr::select(-contains("base")) %>%
    gather(key_expr, expr, contains("expr")) %>%
    gather(key_damid, damid, contains("damid")) %>%
    mutate(key_expr = str_remove(key_expr, "expr_"),
           key_damid = str_remove(key_damid, "damid_")) %>%
    filter(key_damid == key_expr) %>%
    mutate(key_damid = factor(key_damid, levels = samples),
           sign = sign)
    
  # Plot
  plt <- tib_gather %>%
    ggplot(aes(x = damid, y = expr)) +
    geom_point(aes(col = sign, alpha = sign)) +
    geom_vline(xintercept = 0, linetype = "dashed", col = "blue") +
    geom_hline(yintercept = 0, linetype = "dashed", col = "blue") +
    geom_smooth(method = "lm", col = "red") +
    facet_grid(LAD ~ key_damid) +
    scale_color_manual(values = c("blue", "black", "red")) +
    scale_alpha_manual(values = c(0.5, 0.05, 0.5)) +
    theme_bw() +
    theme(aspect.ratio = 1)
  plot(plt)
  
  # Plot LAD & latest time point
  tib_filter <- tib_gather %>%
    filter(LAD == T, key_damid == tail(tib_gather$key_damid)[1]) %>%
    drop_na()
  
  plt <- tib_filter %>%
    ggplot(aes(x = damid, y = expr)) +
    geom_point(aes(col = sign, size = sign)) +
    geom_vline(xintercept = 0, linetype = "dashed", col = "black") +
    geom_hline(yintercept = 0, linetype = "dashed", col = "black") +
    scale_color_manual(values = c("blue", "black", "red"), 
                       labels = c(paste0("Down (n=", sum(tib_filter$sign == "down"), ")"),
                                  paste0("Stable (n=", sum(tib_filter$sign == "stable"), ")"),
                                  paste0("Up (n=", sum(tib_filter$sign == "up"), ")"))) +
    scale_size_manual(values = c(1, 0.25, 1), guide = F) +
    facet_grid(. ~ key_damid) +
    ggtitle(paste0("Pearson=",
                   round(cor(tib_filter$expr, tib_filter$damid), 3),
                   "; p-value=",
                   signif(cor.test(tib_filter$expr, tib_filter$damid)$p.value, 2))) +
    xlab("Delta DamID") +
    ylab("Delta Expr") +
    coord_cartesian(xlim = c(-2, 2), ylim = c(-3, 6)) +
    theme_bw() +
    theme(aspect.ratio = 1)
  plot(plt)
  
  # Plot density
  plt <- tib_gather %>%
    ggplot(aes(x = damid)) +
    geom_density(aes(col = sign)) +
    geom_vline(xintercept = 0, linetype = "dashed", col = "blue") +
    geom_hline(yintercept = 0, linetype = "dashed", col = "blue") +
    facet_grid(LAD ~ key_damid) +
    scale_color_manual(values = c("blue", "black", "red")) +
    theme_bw() +
    theme(aspect.ratio = 1)
  plot(plt)
  
  plt <- tib_gather %>%
    ggplot(aes(x = damid)) +
    stat_ecdf(aes(col = sign)) +
    geom_vline(xintercept = 0, linetype = "dashed", col = "blue") +
    geom_hline(yintercept = 0, linetype = "dashed", col = "blue") +
    facet_grid(LAD ~ key_damid) +
    scale_color_manual(values = c("blue", "black", "red")) +
    theme_bw() +
    theme(aspect.ratio = 1)
  plot(plt)
  
  # Plot without stable group
  plt <- tib_gather %>%
    filter(sign != "stable") %>%
    ggplot(aes(x = damid, y = expr)) +
    geom_point(aes(col = sign, alpha = sign)) +
    geom_vline(xintercept = 0, linetype = "dashed", col = "blue") +
    geom_hline(yintercept = 0, linetype = "dashed", col = "blue") +
    #geom_smooth(method = "lm", col = "red") +
    facet_grid(LAD ~ key_damid) +
    scale_color_manual(values = c("blue", "red")) +
    scale_alpha_manual(values = c(0.5, 0.5)) +
    theme_bw() +
    theme(aspect.ratio = 1)
  plot(plt)
  
  # Plot only changing NL genes
  plt <- tib_gather %>%
    filter(damid < -0.5 | damid > 0.5) %>%
    ggplot(aes(x = damid, y = expr)) +
    geom_point(size = 0.5) +
    geom_vline(xintercept = 0, linetype = "dashed", col = "blue") +
    geom_hline(yintercept = 0, linetype = "dashed", col = "blue") +
    #geom_smooth(method = "lm", col = "red") +
    facet_grid(. ~ key_damid) +
    theme_bw() +
    theme(aspect.ratio = 1)
  plot(plt)
  
  # Plot density
  plt <- tib_gather %>%
    mutate(damid_group = case_when(damid < -0.5 ~ "NL_lower",
                                   damid > 0.5 ~ "NL_higher",
                                   T ~ "NL_stable"),
           damid_group = factor(damid_group,
                                levels = c("NL_lower", "NL_stable", "NL_higher"))) %>%
    ggplot(aes(x = expr)) +
    geom_density(alpha = 0.2, aes(fill = damid_group)) +
    geom_vline(xintercept = 0, linetype = "dashed", col = "blue") +
    geom_hline(yintercept = 0, linetype = "dashed", col = "blue") +
    facet_grid(. ~ key_damid) +
    scale_fill_manual(values = c("blue", "black", "red")) +
    coord_cartesian(xlim = c(-0.5, 0.5), ylim = c(0, 30)) +
    theme_bw() +
    theme(aspect.ratio = 1)
  plot(plt)
  
  # Print correlation between NL interactions and expression
  tib_corr <- tib_gather %>%
    drop_na() %>%
    filter(LAD == T) %>%
    group_by(key_damid) %>%
    summarise(cor = cor(expr, damid, method = "pearson"),
              pvalue = cor.test(expr, damid, method = "pearson")$p.value,
              n = n()) %>%
    ungroup() %>%
    mutate(sign = pvalue < 0.01)
  
  plt <- tib_corr %>%
    ggplot(aes(x = key_damid, y = cor, fill = sign)) +
    geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
    geom_bar(stat = "identity") +
    xlab("") +
    ylab("Pearson correlation") +
    scale_fill_grey() +
    theme_bw() +
    theme(aspect.ratio = 2,
          axis.text.x = element_text(angle = 90, hjust = 1))
  plot(plt)
  
  tib_corr
    
  
  # # Also, determine ratio up/down in iLADs and in LADs
  # tib_ratio <- tib_gather %>%
  #   group_by(key_damid, LAD) %>%
  #   summarise(down = mean(sign == "down"),
  #             up = mean(sign == "up")) %>%
  #   ungroup() %>%
  #   gather(key_sign, fraction, down, up)
  # 
  # plt <- tib_ratio %>%
  #   ggplot(aes(x = LAD, y = fraction, fill = key_sign)) +
  #   geom_bar(stat = "identity", position = "dodge") +
  #   #facet_wrap( ~ key_damid, scales = "free_y") +
  #   facet_grid(. ~ key_damid) +
  #   scale_fill_grey() +
  #   theme_classic() +
  #   theme(aspect.ratio = 3)
  # plot(plt)
  # 
  # # Repeat, the other way around
  # tib_ratio <- tib_gather %>%
  #   group_by(key_damid, sign) %>%
  #   summarise(mean = mean(LAD)) %>%
  #   ungroup()
  # 
  # plt <- tib_ratio %>%
  #   ggplot(aes(x = key_damid, y = mean, fill = sign)) +
  #   geom_bar(stat = "identity", position = "dodge") +
  #   xlab("") +
  #   ylab("Fraction in LAD") +
  #   scale_fill_manual(values = c("blue", "darkgrey", "red")) +
  #   theme_classic() +
  #   theme(aspect.ratio = 1)
  # plot(plt)
  
}

PlotExpressionVsDamID(genes, genes.res, genes.fpkm, genes.damid, 
                      base = "CTCFEL_0h", 
                      samples = c("CTCFEL_6h", "CTCFEL_24h", "CTCFEL_96h"))

## # A tibble: 3 × 5
##   key_damid       cor   pvalue     n sign 
##   <fct>         <dbl>    <dbl> <int> <lgl>
## 1 CTCFEL_6h  -0.0298  3.37e- 2  5064 FALSE
## 2 CTCFEL_24h -0.00792 5.73e- 1  5056 FALSE
## 3 CTCFEL_96h -0.143   1.09e-24  5064 TRUE
PlotExpressionVsDamID(genes, genes.res, genes.fpkm, genes.damid, 
                      base = "WAPL_0h", 
                      samples = c("WAPL_6h", "WAPL_24h", "WAPL_96h"))

## # A tibble: 3 × 5
##   key_damid      cor    pvalue     n sign 
##   <fct>        <dbl>     <dbl> <int> <lgl>
## 1 WAPL_6h   -0.00413 0.769      5075 FALSE
## 2 WAPL_24h   0.0381  0.00660    5088 TRUE 
## 3 WAPL_96h  -0.0602  0.0000171  5087 TRUE
PlotExpressionVsDamID(genes, genes.res, genes.fpkm, genes.damid, 
                      base = "CTCFWAPL_0h", 
                      samples = c("CTCFWAPL_6h", "CTCFWAPL_24h", "CTCFWAPL_96h"))

## # A tibble: 3 × 5
##   key_damid          cor   pvalue     n sign 
##   <fct>            <dbl>    <dbl> <int> <lgl>
## 1 CTCFWAPL_6h   0.000627 9.64e- 1  5078 FALSE
## 2 CTCFWAPL_24h  0.0406   3.85e- 3  5074 TRUE 
## 3 CTCFWAPL_96h -0.120    7.46e-18  5083 TRUE
PlotExpressionVsDamID(genes, genes.res, genes.fpkm, genes.damid, 
                      base = "RAD21_0h", 
                      samples = c("RAD21_6h", "RAD21_24h"))

## # A tibble: 2 × 5
##   key_damid     cor   pvalue     n sign 
##   <fct>       <dbl>    <dbl> <int> <lgl>
## 1 RAD21_6h  -0.0113 4.21e- 1  5050 FALSE
## 2 RAD21_24h -0.0990 1.68e-12  5056 TRUE
# Filtered for expr > 2.5 (as previous document)
tib_corr_ctcf <- PlotExpressionVsDamID(genes, genes.res, genes.fpkm, genes.damid, 
                      base = "CTCFEL_0h", 
                      samples = c("CTCFEL_6h", "CTCFEL_24h", "CTCFEL_96h"),
                      min_expr = 2.5)

tib_corr_wapl <- PlotExpressionVsDamID(genes, genes.res, genes.fpkm, genes.damid, 
                      base = "WAPL_0h", 
                      samples = c("WAPL_6h", "WAPL_24h", "WAPL_96h"),
                      min_expr = 2.5)

tib_corr_ctcfwapl <- PlotExpressionVsDamID(genes, genes.res, genes.fpkm, genes.damid, 
                      base = "CTCFWAPL_0h", 
                      samples = c("CTCFWAPL_6h", "CTCFWAPL_24h", "CTCFWAPL_96h"),
                      min_expr = 2.5)

tib_corr_rad21 <- PlotExpressionVsDamID(genes, genes.res, genes.fpkm, genes.damid, 
                      base = "RAD21_0h", 
                      samples = c("RAD21_6h", "RAD21_24h"),
                      min_expr = 2.5)

# Combine correlations in one final plot
tib_corr <- bind_rows(tib_corr_ctcf,
                      tib_corr_wapl,
                      tib_corr_ctcfwapl,
                      tib_corr_rad21) %>%
  mutate(padj = p.adjust(pvalue),
         sign = padj < 0.05) %>%
  # filter for "significant" tests only
  filter(key_damid %in% str_remove(significant_tests, "_vs.*")) %>%
  separate(key_damid, c("target", "timepoint"), remove = F) %>%
  mutate(target = str_remove(target, "EL"),
         target = factor(target, c("CTCF", "RAD21", "WAPL", "CTCFWAPL")),
         timepoint = factor(timepoint, c("24h", "48h", "96h"))) %>%
  arrange(target, timepoint) %>%
  mutate(key_damid = factor(key_damid, unique(key_damid)))
  

# Plot
tib_corr %>%
  ggplot(aes(x = key_damid, y = cor, fill = sign)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("grey80", "grey30")) +
  xlab("") +
  ylab("Pearson correlation") +
  theme_bw() +
  theme(aspect.ratio = 1,
        axis.text.x = element_text(angle = 90, hjust = 1))

Many plots. As expected, there is correlation between changes in expression and changes in nuclear positioning. This is normal.

Plot the DamID changes for differentially expressed genes. This is another way to show the same thing: differentially expressed genes also changes their lamina positioning.

# 1) Determine DamID changes with t=0h
tib_damid_changes <- as_tibble(mcols(genes.damid)) %>%
  mutate(CTCFEL_96h = CTCFEL_96h - CTCFEL_0h,
         RAD21_24h = RAD21_24h - RAD21_0h,
         WAPL_96h = WAPL_96h - WAPL_0h,
         CTCFWAPL_24h = CTCFWAPL_24h - CTCFWAPL_0h,
         CTCFWAPL_96h = CTCFWAPL_96h - CTCFWAPL_0h,
         gene = 1:nrow(.)) %>%
  dplyr::select(CTCFEL_96h, RAD21_24h, WAPL_96h,
                CTCFWAPL_24h, CTCFWAPL_96h, gene) %>%
  gather(key, damid, -gene)


# 2) Get significant changes
tib_genes_sign <- as_tibble(genes.res) %>%
  dplyr::select(matches(paste(significant_tests, collapse = "|"))) %>%
  dplyr::select(contains("sign")) %>%
  dplyr::select(-contains("48h")) %>%
  mutate(gene = 1:nrow(.)) %>%
  gather(key, gene_sign, -gene) %>%
  mutate(key = str_remove(key, "_vs.*"))


# 3) Join the two and filter for selected genes (expression > cutoff)
tib_combined <- full_join(tib_genes_sign, tib_damid_changes) %>%
  #filter(gene %in% which(genes_expression_idx)) %>%
  separate(key, c("target", "timepoint"), remove = F) %>%
  mutate(target = str_remove(target, "EL"),
         target = factor(target, c("CTCF", "RAD21", "WAPL", "CTCFWAPL")),
         timepoint = factor(timepoint, c("24h", "48h", "96h"))) %>%
  arrange(gene, target, timepoint) %>%
  mutate(key = factor(key, unique(key))) 


# LAD genes only?
lad_idx <- which(overlapsAny(genes.damid, gr_LAD_consensus,
                             ignore.strand = T))
#lad_idx <- which(rowSums(as_tibble(mcols(genes.damid)) > 0) > 0)

tib_combined <- tib_combined %>% 
  filter(gene %in% lad_idx)


# 4) Plot
tib_combined %>%
  ggplot(aes(x = target, y = damid, fill = gene_sign)) +
  geom_boxplot(outlier.shape = NA, position = "dodge") +
  geom_hline(yintercept = 0, col = "grey30", linetype = "dashed") +
  facet_grid(. ~ timepoint, scales = "free_x", space = "free") +
  xlab("") +
  ylab("pA-DamID difference with t=0h") +
  scale_fill_manual(values = c("blue", "grey50", "red"),
                    name = "Class") +
  coord_cartesian(ylim = c(-1.3, 1.3)) +
  theme_bw() +
  theme(#aspect.ratio = 2/3,
        axis.text.x = element_text(angle = 90, hjust = 1))

# 5) Calculate statistics for illustrator (lazy approach)
tib_pvalues <- tibble()

for (tmp_key in unique(tib_combined$key)) {
  
  tmp <- tib_combined %>%
    filter(key == tmp_key)
  
  for (tmp_sign in c("up", "down")) {
    
    test <- wilcox.test(tmp$damid[tmp$gene_sign == "stable"], 
                        tmp$damid[tmp$gene_sign == tmp_sign],
                        conf.int = TRUE)
    
    tib_pvalues <- bind_rows(tib_pvalues,
                             tibble(key = tmp_key, 
                                    gene_sign = tmp_sign,
                                    n_sign = sum(tmp$gene_sign == tmp_sign),
                                    pvalue = test$p.value,
                                    direction = ifelse(test$estimate > 0, 
                                                       "up", "down")))
    
  }
}

# Multiple testing
tib_pvalues %>%
  mutate(padj = p.adjust(pvalue),
         sign = padj < 0.05) %>%
  print(n = 40)
## # A tibble: 10 × 7
##    key          gene_sign n_sign   pvalue direction     padj sign 
##    <chr>        <chr>      <int>    <dbl> <chr>        <dbl> <lgl>
##  1 CTCFEL_96h   up           405 1.64e-23 up        1.48e-22 TRUE 
##  2 CTCFEL_96h   down          73 5.17e- 6 down      3.10e- 5 TRUE 
##  3 RAD21_24h    up           183 4.58e- 7 up        3.67e- 6 TRUE 
##  4 RAD21_24h    down          23 8.82e- 2 down      3.53e- 1 FALSE
##  5 WAPL_96h     up           514 7.53e- 7 up        5.27e- 6 TRUE 
##  6 WAPL_96h     down          91 6.34e- 1 down      6.35e- 1 FALSE
##  7 CTCFWAPL_24h up            29 3.17e- 1 down      6.35e- 1 FALSE
##  8 CTCFWAPL_24h down          21 2.79e- 3 down      1.39e- 2 TRUE 
##  9 CTCFWAPL_96h up           841 2.22e-28 up        2.22e-27 TRUE 
## 10 CTCFWAPL_96h down         406 1.90e- 1 down      5.70e- 1 FALSE

Yes. As I said. Do the same thing for nascent expression changes.

# 1) Determine DamID changes with t=0h
tib_damid_changes <- as_tibble(mcols(genes.damid)) %>%
  mutate(WAPL_6h = WAPL_6h - WAPL_0h,
         WAPL_24h = WAPL_24h - WAPL_0h,
         gene = 1:nrow(.)) %>%
  dplyr::select(WAPL_6h, WAPL_24h, gene) %>%
  gather(key, damid, -gene)

# 2) Get significant changes
tib_genes_sign <- as_tibble(genes.res.nascent) %>%
  dplyr::select(contains("sign")) %>%
  mutate(gene = 1:nrow(.)) %>%
  gather(key, gene_sign, -gene) %>%
  mutate(key = str_remove(key, "_vs.*")) 

# 3) Join the two and filter for selected genes (expression > cutoff)
tib_combined <- full_join(tib_genes_sign, tib_damid_changes) %>%
  #filter(gene %in% which(genes_expression_idx)) %>%
  separate(key, c("target", "timepoint"), remove = F) %>%
  mutate(target = factor(target, "WAPL"),
         timepoint = factor(timepoint, c("6h" ,"24h"))) %>%
  arrange(gene, target, timepoint) %>%
  mutate(key = factor(key, unique(key))) 

# LAD genes only?
tib_combined <- tib_combined %>%
  filter(gene %in% lad_idx)

# 4) Plot
tib_combined %>%
  ggplot(aes(x = key, y = damid, fill = gene_sign)) +
  geom_boxplot(outlier.shape = NA, position = "dodge") +
  geom_hline(yintercept = 0, col = "grey30", linetype = "dashed") +
  xlab("") +
  ylab("pA-DamID difference with t=0h") + 
  scale_fill_manual(values = c("blue", "grey50", "red"),
                    name = "Class") +
  coord_cartesian(ylim = c(-1, 1)) +
  theme_bw() +
  theme(aspect.ratio = 1,
        axis.text.x = element_text(angle = 90, hjust = 1))

# 5) Calculate statistics for illustrator (lazy approach)
tib_pvalues <- tibble()

for (tmp_key in unique(tib_combined$key)) {
  
  tmp <- tib_combined %>%
    filter(key == tmp_key)
  
  for (tmp_sign in c("up", "down")) {
    
    test <- wilcox.test(tmp$damid[tmp$gene_sign == "stable"], 
                        tmp$damid[tmp$gene_sign == tmp_sign],
                        conf.int = TRUE)
    
    tib_pvalues <- bind_rows(tib_pvalues,
                             tibble(key = tmp_key, 
                                    gene_sign = tmp_sign,
                                    n_sign = sum(tmp$gene_sign == tmp_sign),
                                    pvalue = test$p.value,
                                    direction = ifelse(test$estimate > 0, 
                                                       "up", "down")))
    
  }
}

# Multiple testing 
tib_pvalues %>%
  mutate(padj = p.adjust(pvalue),
         sign = padj < 0.05) %>%
  print(n = 40)
## # A tibble: 4 × 7
##   key      gene_sign n_sign pvalue direction  padj sign 
##   <chr>    <chr>      <int>  <dbl> <chr>     <dbl> <lgl>
## 1 WAPL_6h  up            11  0.720 up            1 FALSE
## 2 WAPL_6h  down           9  0.767 down          1 FALSE
## 3 WAPL_24h up            92  0.476 down          1 FALSE
## 4 WAPL_24h down          30  0.347 down          1 FALSE

Here, the picture is less clear. It seems that other effects (i.e. CTCF detachment) are stronger than the transcription effect on lamina interactions.

3. LAD genes near CTCF in RAD21

Skipped.

4. Correlating LAD changes versus differential genes

I want to show that differential genes are not causing the LAD changes I observed previously. This is the main thing that I want to convey. But how to show this?

To do this, I will determine the change in LAD score for LADs with up and downregulated genes. If the LAD score is not correlated with the presence of transcriptional differences, I can conclude that transcription is not causing the change in nuclear positioning.

# Get the results
significant_tests_filtered <- grep("48h", significant_tests, 
                                   invert = T, value = T)

tib_results <- as_tibble(mcols(genes.res)) %>% 
  dplyr::select(contains("sign")) %>% 
  dplyr::select(matches(paste(significant_tests_filtered, collapse = "|"))) %>%
  dplyr::rename_all(function(x) str_remove(x, "_vs.*"))


# Count number of genes per differential LAD
tib <- as_tibble(mcols(gr_LAD_consensus)) %>%
  mutate(gene_total = countOverlaps(gr_LAD_consensus, genes.res,
                                    ignore.strand = T))

tib_combined <- tibble()

for (t in names(tib_results)) {
  
  # Determine the number of up and downregulated genes - and a "LAD summary"
  tib <- tib %>%
    bind_cols(tibble(up = countOverlaps(gr_LAD_consensus, 
                                        genes.res[tib_results %>% pull(t) == "up"],
                                        ignore.strand = T),
                     down = countOverlaps(gr_LAD_consensus, 
                                          genes.res[tib_results %>% pull(t) == "down"],
                                          ignore.strand = T)) %>%
                mutate(class = case_when((up > 0) & (down > 0) ~ "both",
                                         up > down ~ "up",
                                         down > up ~ "down",
                                         #(up > 0) & (down > 0) ~ "both",
                                         T ~ "stable"),
                       class = factor(class, c("down", "stable", "up", "both")),
                       diff = tib %>% pull(t) -
                         tib %>% pull(str_replace(t, "_.*", "_0h"))) %>%
                dplyr::rename_all(function(x) paste(t, x, sep = "_")))
  
  # Plot immediately
  plt <- tib %>%
    dplyr::rename_all(function(x) str_remove(x, paste0(t, "_"))) %>%
    ggplot(aes(x = class, y = diff, fill = class)) +
    geom_boxplot(outlier.shape = NA) +
    ggtitle(t) +
    theme_bw() +
    theme(aspect.ratio = 1)
  
  plot(plt)
  
  # Add to tib combined with all the details
  tib_combined <- bind_rows(tib_combined,
                            tib %>%
                              dplyr::rename_all(function(x) str_remove(x, paste0(t, "_"))) %>%
                              dplyr::select(class, diff, up, down) %>%
                              mutate(test = t))
                            
}

# Remove LADs with up and downregulated genes
tib_combined <- tib_combined %>%
  filter(class != "both")
  
    
# Plot combined
tib_combined %>%
  separate(test, c("condition", "timepoint"), remove = F) %>%
  mutate(condition = factor(condition, levels(metadata$condition)),
         timepoint = factor(timepoint, levels(metadata$timepoint))) %>%
  arrange(condition, timepoint) %>%
  ggplot(aes(x = condition, y = diff, fill = class)) +
  geom_boxplot(outlier.shape = NA, position = "dodge") +
  geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
  facet_grid(. ~ timepoint, scales = "free_x", space = "free") +
  coord_cartesian(ylim = c(-1.4, 1.4)) +
  xlab("") +
  ylab("LAD difference") +
  scale_fill_manual(values = c("blue", "grey50", "red"),
                    name = "Class") +
  theme_bw() +
  theme(# aspect.ratio = 2/3,
          axis.text.x = element_text(angle = 90, hjust = 1))

# Calculate statistics for illustrator (lazy approach)
tib_pvalues <- tibble()

for (tmp_key in unique(tib_combined$test)) {
  
  tmp <- tib_combined %>%
    filter(test == tmp_key)
  
  for (tmp_sign in c("up", "down")) {
    
    test <- wilcox.test(tmp$diff[tmp$class == "stable"], 
                        tmp$diff[tmp$class == tmp_sign],
                        conf.int = TRUE)
    
    tib_pvalues <- bind_rows(tib_pvalues,
                             tibble(key = tmp_key, 
                                    class = tmp_sign,
                                    n_sign = sum(tmp$class == tmp_sign),
                                    pvalue = test$p.value,
                                    direction = ifelse(test$estimate > 0, 
                                                       "up", "down")))
    
  }
}

# Multiple testing 
tib_pvalues %>%
  mutate(padj = p.adjust(pvalue),
         sign = padj < 0.05) %>%
  print(n = 40)
## # A tibble: 10 × 7
##    key          class n_sign   pvalue direction     padj sign 
##    <chr>        <chr>  <int>    <dbl> <chr>        <dbl> <lgl>
##  1 CTCFEL_96h   up       238 7.62e- 1 up        1   e+ 0 FALSE
##  2 CTCFEL_96h   down      47 1.88e- 5 down      1.69e- 4 TRUE 
##  3 WAPL_96h     up       273 2.82e- 3 up        1.97e- 2 TRUE 
##  4 WAPL_96h     down      43 1.15e- 2 up        6.89e- 2 FALSE
##  5 CTCFWAPL_24h up        28 1.48e- 1 up        7.42e- 1 FALSE
##  6 CTCFWAPL_24h down      20 9.66e- 1 up        1   e+ 0 FALSE
##  7 CTCFWAPL_96h up       317 1.07e-17 up        1.07e-16 TRUE 
##  8 CTCFWAPL_96h down     100 4.29e- 1 down      1   e+ 0 FALSE
##  9 RAD21_24h    up       153 1.60e- 3 up        1.28e- 2 TRUE 
## 10 RAD21_24h    down      19 8.33e- 1 down      1   e+ 0 FALSE

Good. As hoped, most LAD changes are not correlated with the presence of differentially expressed genes.

Repeat for nascent transcription.

# Get the results
tib_results <- as_tibble(mcols(genes.res.nascent)) %>% 
  dplyr::select(contains("sign")) %>% 
  dplyr::rename_all(function(x) str_remove(x, "_vs.*"))


# Count number of genes per differential LAD
tib <- as_tibble(mcols(gr_LAD_consensus)) %>%
  mutate(gene_total = countOverlaps(gr_LAD_consensus, genes.res.nascent,
                                    ignore.strand = T))

tib_combined <- tibble()

for (t in names(tib_results)) {
  
  # Determine the number of up and downregulated genes - and a "LAD summary"
  tib <- tib %>%
    bind_cols(tibble(up = countOverlaps(gr_LAD_consensus, 
                                        genes.res.nascent[tib_results %>% pull(t) == "up"],
                                        ignore.strand = T),
                     down = countOverlaps(gr_LAD_consensus, 
                                          genes.res.nascent[tib_results %>% pull(t) == "down"],
                                          ignore.strand = T)) %>%
                mutate(class = case_when((up > 0) & (down > 0) ~ "both",
                                         up > down ~ "up",
                                         down > up ~ "down",
                                         #(up > 0) & (down > 0) ~ "both",
                                         T ~ "stable"),
                       class = factor(class, c("down", "stable", "up", "both")),
                       diff = tib %>% pull(t) -
                         tib %>% pull(str_replace(t, "_.*", "_0h"))) %>%
                dplyr::rename_all(function(x) paste(t, x, sep = "_")))
  
  # Plot immediately
  plt <- tib %>%
    dplyr::rename_all(function(x) str_remove(x, paste0(t, "_"))) %>%
    ggplot(aes(x = class, y = diff, fill = class)) +
    geom_boxplot(outlier.shape = NA) +
    ggtitle(t) +
    theme_bw() +
    theme(aspect.ratio = 1)
  
  plot(plt)
  
  # Add to tib combined with all the details
  tib_combined <- bind_rows(tib_combined,
                            tib %>%
                              dplyr::rename_all(function(x) str_remove(x, paste0(t, "_"))) %>%
                              dplyr::select(class, diff, up, down) %>%
                              mutate(test = t))
                            
}

# Remove LADs with up and downregulated genes
tib_combined <- tib_combined %>%
  filter(class != "both")



# Plot combined
tib_combined %>%
  filter(class != "both") %>%
  separate(test, c("condition", "timepoint"), remove = F) %>%
  mutate(condition = factor(condition, levels(metadata$condition)),
         timepoint = factor(timepoint, levels(metadata$timepoint))) %>%
  arrange(condition, timepoint) %>%
  ggplot(aes(x = timepoint, y = diff, fill = class)) +
  geom_boxplot(outlier.shape = NA, position = "dodge") +
  geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
  #facet_grid(. ~ timepoint, scales = "free_x", space = "free") +
  coord_cartesian(ylim = c(-0.8, 0.8)) +
  xlab("") +
  ylab("LAD difference") +
  scale_fill_manual(values = c("blue", "grey50", "red"),
                    name = "Class") +
  theme_bw() +
  theme(aspect.ratio = 1,
          axis.text.x = element_text(angle = 90, hjust = 1))

# Calculate statistics for illustrator (lazy approach)
tib_pvalues <- tibble()

for (tmp_key in unique(tib_combined$test)) {
  
  tmp <- tib_combined %>%
    filter(test == tmp_key)
  
  for (tmp_sign in c("up", "down")) {
    
    test <- wilcox.test(tmp$diff[tmp$class == "stable"], 
                        tmp$diff[tmp$class == tmp_sign],
                        conf.int = TRUE)
    
    tib_pvalues <- bind_rows(tib_pvalues,
                             tibble(key = tmp_key, 
                                    class = tmp_sign,
                                    n_sign = sum(tmp$class == tmp_sign),
                                    pvalue = test$p.value,
                                    direction = ifelse(test$estimate > 0, 
                                                       "up", "down")))
    
  }
}

# Multiple testing 
tib_pvalues %>%
  mutate(padj = p.adjust(pvalue),
         sign = padj < 0.05) %>%
  print(n = 40)
## # A tibble: 4 × 7
##   key      class n_sign pvalue direction  padj sign 
##   <chr>    <chr>  <int>  <dbl> <chr>     <dbl> <lgl>
## 1 WAPL_6h  up        11  0.134 up        0.425 FALSE
## 2 WAPL_6h  down       9  0.106 up        0.425 FALSE
## 3 WAPL_24h up        73  0.129 up        0.425 FALSE
## 4 WAPL_24h down      24  0.607 up        0.607 FALSE

These results show that differential genes are mostly not correlated with LAD changes. This is roughly the same message as written in the manuscript right now, but is slightly more accurate (and easy to interpret) in my opinion. Also, it’s easier to include this result in the story.

x. Save data

saveRDS(genes.damid, 
        file.path(output.dir, "genes_damid.rds"))

Conclusion

Good. Conclusions:

  • Overall, there is a negative correlation between changes in gene expression and changes in lamina positioning. This is normal. The effect size is very small. It seems that other things result in stronger disruptions of the LAD pattern.
  • The presence of differentially expressed genes does not explain the LAD behaviour in going up / down. This is good, as it suggests a direct effect of CTCF / cohesin on the LAD pattern.

SessionInfo

sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] knitr_1.33           GGally_2.1.2         yaml_2.2.1          
##  [4] caTools_1.18.2       rtracklayer_1.50.0   GenomicRanges_1.42.0
##  [7] GenomeInfoDb_1.26.7  IRanges_2.24.1       S4Vectors_0.28.1    
## [10] BiocGenerics_0.36.1  forcats_0.5.1        stringr_1.4.0       
## [13] dplyr_1.0.7          purrr_0.3.4          readr_2.0.0         
## [16] tidyr_1.1.3          tibble_3.1.6         ggplot2_3.3.5       
## [19] tidyverse_1.3.1     
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7                matrixStats_0.60.0         
##  [3] fs_1.5.0                    lubridate_1.7.10           
##  [5] RColorBrewer_1.1-2          httr_1.4.2                 
##  [7] tools_4.0.5                 backports_1.2.1            
##  [9] bslib_0.2.5.1               utf8_1.2.2                 
## [11] R6_2.5.1                    DBI_1.1.1                  
## [13] colorspace_2.0-2            withr_2.4.2                
## [15] tidyselect_1.1.1            compiler_4.0.5             
## [17] cli_3.1.0                   rvest_1.0.1                
## [19] Biobase_2.50.0              xml2_1.3.2                 
## [21] DelayedArray_0.16.3         labeling_0.4.2             
## [23] sass_0.4.0                  scales_1.1.1               
## [25] digest_0.6.28               Rsamtools_2.6.0            
## [27] rmarkdown_2.11              XVector_0.30.0             
## [29] pkgconfig_2.0.3             htmltools_0.5.1.1          
## [31] MatrixGenerics_1.2.1        highr_0.9                  
## [33] dbplyr_2.1.1                rlang_0.4.12               
## [35] readxl_1.3.1                rstudioapi_0.13            
## [37] farver_2.1.0                jquerylib_0.1.4            
## [39] generics_0.1.0              jsonlite_1.7.2             
## [41] BiocParallel_1.24.1         RCurl_1.98-1.3             
## [43] magrittr_2.0.1              GenomeInfoDbData_1.2.4     
## [45] Matrix_1.3-4                Rcpp_1.0.7                 
## [47] munsell_0.5.0               fansi_0.5.0                
## [49] lifecycle_1.0.1             stringi_1.7.3              
## [51] SummarizedExperiment_1.20.0 zlibbioc_1.36.0            
## [53] plyr_1.8.6                  grid_4.0.5                 
## [55] crayon_1.4.2                lattice_0.20-44            
## [57] Biostrings_2.58.0           haven_2.4.1                
## [59] hms_1.1.0                   pillar_1.6.4               
## [61] reprex_2.0.0                XML_3.99-0.6               
## [63] glue_1.5.0                  evaluate_0.14              
## [65] modelr_0.1.8                vctrs_0.3.8                
## [67] tzdb_0.1.2                  cellranger_1.1.0           
## [69] gtable_0.3.0                reshape_0.8.8              
## [71] assertthat_0.2.1            xfun_0.24                  
## [73] broom_0.7.9                 GenomicAlignments_1.26.0   
## [75] ellipsis_0.3.2